library("tidyverse")
library("here")
library("assertthat")
library("tidymodels")
library("xgboost")
library("tictoc")
library("gt")
library("vip")
library("futile.logger")
# Resolve masking conflicts in favor of tidymodels, with two explicit
# exceptions: yardstick::spec() and purrr::discard().
tidymodels_prefer()
conflicted::conflict_prefer("spec", "yardstick")
conflicted::conflict_prefer("discard", "purrr")
# Project-wide constants (paths such as data_processed_path, flags such
# as rerun_all / write_to_disk, and forbidden_vars) come from this file.
source(here("src", "01-constants.R"))
# Parallel backend for model tuning; `cores` is also handed to ranger.
cores <- parallel::detectCores()
doParallel::registerDoParallel()
## NULL
## NULL
Dataset used: real
Dataset path: /Users/sebastiansaueruser/Google Drive/research/Covid19/covid-icu-old/data/processed/data-prepared.csv
# Read the prepared data set and log basic facts about it.
stopifnot(file.exists(data_processed_path))
d <- read_csv(data_processed_path)
flog.info("Processed data was read.")
flog.info(
  paste0(
    "Dimensions of data set read are: ",
    str_c(dim(d), collapse = ", ")
  )
)
flog.info(paste0("Data path: ", data_processed_path))
Dimensions of the data set:
dim(d)
## [1] 665 65
Define DV as factor, since we are modelling a classification setting:
# The outcome must be a factor for classification modelling.
d <- mutate(d, verlauf = factor(verlauf))
List of columns with missing values
# Per-column NA counts, shown only for columns that have any,
# in descending order.
d %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(everything()) %>%
  filter(value > 0) %>%
  arrange(desc(value)) %>%
  gt()
| name | value |
|---|---|
| ro | 55 |
| neutros_nl | 38 |
| lymphos_nl | 38 |
| nlr | 38 |
| ven_laktat_mmol_l | 37 |
| ven_bga_p_co2_mm_hg | 35 |
| vbga_p_o2_mm_hg | 35 |
| procalcitonin_ng_m_l | 32 |
| bg | 25 |
| bilirubin_ges_mg_d_l | 21 |
| alt_u_l | 16 |
| ast_u_l | 15 |
| ldh_u_l | 13 |
| q_sofa | 10 |
| harnstoff_mg_dl | 9 |
| af | 8 |
| quick_percent | 8 |
| esi | 3 |
| e_gfr_ml_min | 3 |
| hf | 2 |
| bp_dia | 2 |
| temp | 2 |
| thrombos_nl | 2 |
| bp_sys | 1 |
| gcs | 1 |
| supportive_o2 | 1 |
| leukos_nl | 1 |
| crea_mg_dl | 1 |
| akutes_nierenversagen | 1 |
Number of observations when all NAs (in all columns) are removed:
# Number of complete cases (rows without any NA in any column).
nrow(drop_na(d))
## [1] 511
# Fix the RNG seed so the train/test split is reproducible.
set.seed(42)
# 80/20 split, stratified on the outcome to preserve class balance.
d_split <- initial_split(d, prop = .8, strata = verlauf)
d_split
## <Analysis/Assess/Total>
## <531/134/665>
d_train <- training(d_split)
# BUG FIX: the original assigned training(d_split) to d_test as well,
# so the "test" set silently duplicated the training set. testing()
# returns the held-out rows.
d_test <- testing(d_split)
# Outcome distribution in the training set (absolute and relative).
d_train %>%
  count(verlauf) %>%
  mutate(verlauf_prop = n / sum(n)) %>%
  gt() %>%
  fmt_number(where(is.numeric))
| verlauf | n | verlauf_prop |
|---|---|---|
| bad | 109.00 | 0.21 |
| good | 422.00 | 0.79 |
Hence, the baseline (no-information) accuracy for this model equals the larger of the two class proportions, i.e. about 0.79.
# Gradient-boosted trees (xgboost) with a fixed forest size and six
# tuned hyper-parameters.
xgb_mod <-
  boost_tree(
    trees = 1000,
    tree_depth = tune(),     # model complexity
    min_n = tune(),          # model complexity
    loss_reduction = tune(), # model complexity
    sample_size = tune(),    # randomness
    mtry = tune(),           # randomness
    learn_rate = tune()      # step size
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")
# Random forest (ranger) with tuned mtry and min_n; permutation
# importance is requested for the later variable-importance plot.
rf_mod <-
  rand_forest(trees = 1000, mtry = tune(), min_n = tune()) %>%
  set_engine(
    "ranger",
    num.threads = cores,
    importance = "permutation"
  ) %>%
  set_mode("classification")
# Plain logistic regression; parsnip's default engine ("glm") is used.
logistic_mod <-
  logistic_reg() # engine is set by default
10 folds, 3 repeats.
# Repeated cross-validation: 10 folds x 3 repeats = 30 resamples.
# v = 10 was previously implicit (the vfold_cv() default); made
# explicit here to match the stated design.
folds_spec <-
  vfold_cv(d_train, v = 10, repeats = 3)
# Preprocessing recipe shared by all three models.
# NOTE(review): step_corr() runs before step_impute_knn(), i.e. the
# correlation filter is computed on data that still contains NAs --
# presumably intended, but worth confirming.
basic_recipe <-
  recipe(verlauf ~ ., data = d_train) %>%
  # `id` and the forbidden_vars stay in the data but are excluded from
  # the predictor set by assigning them the "ID" role.
  update_role(id, new_role = "ID") %>%
  update_role(all_of(forbidden_vars), new_role = "ID") %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric(), -all_outcomes()) %>%
  # Drop one of each pair of predictors with |Spearman rho| > 0.7.
  step_corr(all_predictors(), threshold = 0.7, method = "spearman") %>%
  step_impute_knn(all_predictors())
# XGBoost workflow: shared recipe plus boosted-tree specification.
xgb_wf <-
  workflow() %>%
  add_recipe(basic_recipe) %>%
  add_model(xgb_mod)
# Random-forest workflow: shared recipe plus RF specification.
rf_wf <-
  workflow() %>%
  add_recipe(basic_recipe) %>%
  add_model(rf_mod)
rf_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 4 Recipe Steps
##
## • step_zv()
## • step_normalize()
## • step_corr()
## • step_impute_knn()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 1000
## min_n = tune()
##
## Engine-Specific Arguments:
## num.threads = cores
## importance = permutation
##
## Computational engine: ranger
# Logistic-regression workflow: shared recipe plus glm specification.
logistic_wf <-
  workflow() %>%
  add_recipe(basic_recipe) %>%
  add_model(logistic_mod)
# Classification metric set used throughout the analysis.
# NOTE(review): the tuning calls below re-specify this identical list
# inline instead of reusing this object; consider consolidating.
class_metrics <-
  metric_set(
    recall,
    precision,
    f_meas,
    accuracy,
    kap,
    roc_auc,
    sens,
    spec,
    ppv
  )
# Space-filling (Latin hypercube) grid of 30 hyper-parameter candidates
# for the XGBoost model.
xgb_grid <-
  grid_latin_hypercube(
    tree_depth(),
    min_n(),
    loss_reduction(),
    # sample_size must be a proportion for boost_tree().
    sample_size = sample_prop(),
    # mtry's upper bound depends on the number of predictors, hence
    # finalize() against the training data.
    finalize(mtry(), d_train),
    learn_rate(),
    size = 30
  )
doParallel::registerDoParallel()

# Load cached XGBoost tuning results if permitted; otherwise tune the
# workflow over the Latin-hypercube grid and cache the result.
if (rerun_all == FALSE && file.exists(here::here(xgb01_outputfile))) {
  xgb_res <- read_rds(file = xgb01_outputfile)
} else {
  tic()
  cat("Computing XGB with tuning grid\n")
  xgb_res <-
    xgb_wf %>%
    # BUG FIX: the workflow was passed a second time as a positional
    # argument to tune_grid(); it is already supplied via the pipe.
    tune_grid(
      resamples = folds_spec,
      grid = xgb_grid,
      metrics = class_metrics,
      control = control_grid(save_pred = TRUE)
    )
  toc()
  write_rds(xgb_res, file = xgb01_outputfile)
}
rf_mod
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 1000
## min_n = tune()
##
## Engine-Specific Arguments:
## num.threads = cores
## importance = permutation
##
## Computational engine: ranger
# Load cached random-forest tuning results if permitted; otherwise tune
# over an automatically generated grid of 25 candidates and cache.
if (rerun_all == FALSE && file.exists(here::here(rf01_outputfile))) {
  flog.info("Reading RDS file for Random Forest 01 model.")
  rf_res <- read_rds(file = rf01_outputfile)
} else {
  flog.info("Computing Random Forest 01 model.")
  tic()
  cat("Computing RF with tuning grid\n")
  rf_res <-
    rf_wf %>%
    tune_grid(
      resamples = folds_spec,
      grid = 25,
      control = control_grid(save_pred = TRUE),
      # Reuse the metric set defined above instead of restating it.
      metrics = class_metrics
    )
  toc()
  write_rds(rf_res, file = rf01_outputfile)
}
# Resample the (untuned) logistic model on the same CV folds.
logistic_res <-
  logistic_wf %>%
  fit_resamples(
    resamples = folds_spec,
    # control_resamples() is the matching control constructor for
    # fit_resamples(); the original passed control_grid(), which
    # carries tuning-only options.
    control = control_resamples(save_pred = TRUE),
    # Reuse the metric set defined above instead of restating it.
    metrics = class_metrics
  )
write_rds(logistic_res, file = logistic01_outputfile)
First, let’s check the performance in the resamples and see which model candidates performed best.
Let’s have a look at the results:
# First ten rows of the aggregated XGBoost resampling metrics.
collect_metrics(xgb_res, summarize = TRUE) %>%
  slice_head(n = 10) %>%
  gt() %>%
  fmt_number(where(is.numeric))
| mtry | min_n | tree_depth | learn_rate | loss_reduction | sample_size | .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 5.00 | 18.00 | 15.00 | 0.00 | 0.00 | 0.42 | accuracy | binary | 0.79 | 30.00 | 0.01 | Preprocessor1_Model01 |
| 5.00 | 18.00 | 15.00 | 0.00 | 0.00 | 0.42 | f_meas | binary | NaN | 0.00 | NA | Preprocessor1_Model01 |
| 5.00 | 18.00 | 15.00 | 0.00 | 0.00 | 0.42 | kap | binary | 0.00 | 30.00 | 0.00 | Preprocessor1_Model01 |
| 5.00 | 18.00 | 15.00 | 0.00 | 0.00 | 0.42 | ppv | binary | NaN | 0.00 | NA | Preprocessor1_Model01 |
| 5.00 | 18.00 | 15.00 | 0.00 | 0.00 | 0.42 | precision | binary | NaN | 0.00 | NA | Preprocessor1_Model01 |
| 5.00 | 18.00 | 15.00 | 0.00 | 0.00 | 0.42 | recall | binary | 0.00 | 30.00 | 0.00 | Preprocessor1_Model01 |
| 5.00 | 18.00 | 15.00 | 0.00 | 0.00 | 0.42 | roc_auc | binary | 0.72 | 30.00 | 0.02 | Preprocessor1_Model01 |
| 5.00 | 18.00 | 15.00 | 0.00 | 0.00 | 0.42 | sens | binary | 0.00 | 30.00 | 0.00 | Preprocessor1_Model01 |
| 5.00 | 18.00 | 15.00 | 0.00 | 0.00 | 0.42 | spec | binary | 1.00 | 30.00 | 0.00 | Preprocessor1_Model01 |
| 39.00 | 6.00 | 5.00 | 0.00 | 0.00 | 0.71 | accuracy | binary | 0.83 | 30.00 | 0.01 | Preprocessor1_Model02 |
# Visualize tuning results across the hyper-parameter grid.
autoplot(xgb_res)
if (write_to_disk) {
  ggsave(
    filename = paste0(figs_prefix_path, "xgb_res_cv.pdf"),
    width = 10, height = 10
  )
}
# Name the metric explicitly; otherwise show_best() silently picks the
# first metric of the set (recall here, per the output).
show_best(xgb_res, metric = "recall")
## # A tibble: 5 × 12
## mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 20 35 5 1.33e-10 0.515 0.347 recall
## 2 36 37 8 7.10e- 7 0.00000000765 0.137 recall
## 3 61 17 4 3.15e- 2 0.0196 0.757 recall
## 4 51 9 11 1.06e- 2 0.000000164 0.904 recall
## 5 13 5 14 1.44e- 2 0.0000000270 0.436 recall
## # … with 5 more variables: .estimator <chr>, mean <dbl>, n <int>,
## # std_err <dbl>, .config <chr>
# First ten rows of the aggregated random-forest resampling metrics.
collect_metrics(rf_res, summarize = TRUE) %>%
  slice_head(n = 10) %>%
  gt() %>%
  fmt_number(where(is.numeric))
| mtry | min_n | .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|---|---|
| 25.00 | 29.00 | accuracy | binary | 0.66 | 30.00 | 0.06 | Preprocessor1_Model01 |
| 25.00 | 29.00 | f_meas | binary | NaN | 0.00 | NA | Preprocessor1_Model01 |
| 25.00 | 29.00 | kap | binary | 0.00 | 19.00 | 0.00 | Preprocessor1_Model01 |
| 25.00 | 29.00 | ppv | binary | NaN | 0.00 | NA | Preprocessor1_Model01 |
| 25.00 | 29.00 | precision | binary | NaN | 0.00 | NA | Preprocessor1_Model01 |
| 25.00 | 29.00 | recall | binary | 0.00 | 19.00 | 0.00 | Preprocessor1_Model01 |
| 25.00 | 29.00 | roc_auc | binary | 0.50 | 17.00 | 0.00 | Preprocessor1_Model01 |
| 25.00 | 29.00 | sens | binary | 0.00 | 19.00 | 0.00 | Preprocessor1_Model01 |
| 25.00 | 29.00 | spec | binary | 1.00 | 28.00 | 0.00 | Preprocessor1_Model01 |
| 28.00 | 36.00 | accuracy | binary | 0.66 | 30.00 | 0.06 | Preprocessor1_Model02 |
# Visualize RF tuning results across the grid.
autoplot(rf_res)
if (write_to_disk) {
  ggsave(
    filename = paste0(figs_prefix_path, "rf_res_cv.pdf"),
    width = 10, height = 10
  )
}
# Name the metric explicitly; otherwise show_best() silently picks the
# first metric of the set (recall here, per the output).
show_best(rf_res, metric = "recall")
## # A tibble: 5 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 12 2 recall binary 0.158 19 0.0770 Preprocessor1_Model20
## 2 32 11 recall binary 0.132 19 0.0749 Preprocessor1_Model10
## 3 13 5 recall binary 0.132 19 0.0749 Preprocessor1_Model23
## 4 10 9 recall binary 0.105 19 0.0723 Preprocessor1_Model17
## 5 6 7 recall binary 0.105 19 0.0723 Preprocessor1_Model22
# Aggregated resampling metrics of the logistic model.
collect_metrics(logistic_res, summarize = TRUE) %>%
  gt() %>%
  fmt_number(where(is.numeric))
| .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|
| accuracy | binary | 0.79 | 30.00 | 0.01 | Preprocessor1_Model1 |
| f_meas | binary | 0.38 | 30.00 | 0.02 | Preprocessor1_Model1 |
| kap | binary | 0.27 | 30.00 | 0.02 | Preprocessor1_Model1 |
| ppv | binary | 0.50 | 30.00 | 0.03 | Preprocessor1_Model1 |
| precision | binary | 0.50 | 30.00 | 0.03 | Preprocessor1_Model1 |
| recall | binary | 0.33 | 30.00 | 0.02 | Preprocessor1_Model1 |
| roc_auc | binary | 0.73 | 30.00 | 0.02 | Preprocessor1_Model1 |
| sens | binary | 0.33 | 30.00 | 0.02 | Preprocessor1_Model1 |
| spec | binary | 0.92 | 30.00 | 0.01 | Preprocessor1_Model1 |
show_best(logistic_res)
## # A tibble: 1 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 recall binary 0.327 30 0.0210 Preprocessor1_Model1
show_best(rf_res, metric = "recall")
## # A tibble: 5 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 12 2 recall binary 0.158 19 0.0770 Preprocessor1_Model20
## 2 32 11 recall binary 0.132 19 0.0749 Preprocessor1_Model10
## 3 13 5 recall binary 0.132 19 0.0749 Preprocessor1_Model23
## 4 10 9 recall binary 0.105 19 0.0723 Preprocessor1_Model17
## 5 6 7 recall binary 0.105 19 0.0723 Preprocessor1_Model22
select_best(rf_res, metric = "recall")
## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 12 2 Preprocessor1_Model20
# select_best() already returns a one-row tibble of the winning
# hyper-parameters, which finalize_workflow() accepts directly; no need
# to call it twice and rebuild the tibble column by column.
rf_best_params <- select_best(rf_res, metric = "recall")
fin_rf_wf <-
  rf_wf %>%
  finalize_workflow(rf_best_params)
fin_rf_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 4 Recipe Steps
##
## • step_zv()
## • step_normalize()
## • step_corr()
## • step_impute_knn()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = 12
## trees = 1000
## min_n = 2
##
## Engine-Specific Arguments:
## num.threads = cores
## importance = permutation
##
## Computational engine: ranger
# Fit the finalized RF workflow on the full training set and score it
# exactly once on the held-out test set (last_fit uses d_split).
# Reuses the metric set defined above instead of restating it.
final_rf_fit <-
  fin_rf_wf %>%
  last_fit(d_split,
           metrics = class_metrics)
final_rf_fit
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [531/134]> train/test split <tibble> <tibble> <tibble> <workflow>
Check the notes:
final_rf_fit %>% pluck(".notes", 1) %>% pull(3)
## character(0)
# Persist the final fit only when writing artifacts is enabled.
# (`== TRUE` was redundant on a logical flag; braces added for safety.)
if (write_to_disk) {
  write_rds(final_rf_fit, file = final_rf_fit_file)
}
# Collect and echo the test-set metrics of the final RF model.
final_fit_metrics_rf <- collect_metrics(final_rf_fit)
final_fit_metrics_rf
## # A tibble: 9 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 recall binary 0.25 Preprocessor1_Model1
## 2 precision binary 0.7 Preprocessor1_Model1
## 3 f_meas binary 0.368 Preprocessor1_Model1
## 4 accuracy binary 0.821 Preprocessor1_Model1
## 5 kap binary 0.290 Preprocessor1_Model1
## 6 sens binary 0.25 Preprocessor1_Model1
## 7 spec binary 0.972 Preprocessor1_Model1
## 8 ppv binary 0.7 Preprocessor1_Model1
## 9 roc_auc binary 0.816 Preprocessor1_Model1
# Render the test-set metrics as a formatted table.
final_fit_metrics_rf %>%
  select(!.config) %>%
  gt() %>%
  fmt_number(where(is.numeric))
| .metric | .estimator | .estimate |
|---|---|---|
| recall | binary | 0.25 |
| precision | binary | 0.70 |
| f_meas | binary | 0.37 |
| accuracy | binary | 0.82 |
| kap | binary | 0.29 |
| sens | binary | 0.25 |
| spec | binary | 0.97 |
| ppv | binary | 0.70 |
| roc_auc | binary | 0.82 |
# Variable-importance plot (top 20 predictors) of the final RF model.
final_workflow <- pluck(final_rf_fit, ".workflow", 1)
vip(extract_fit_parsnip(final_workflow), num_features = 20)
# Class distribution in the full data set (baseline proportions).
verlauf_counts <- count(d, verlauf)
verlauf_counts %>%
  mutate(prop = n / sum(n)) %>%
  gt() %>%
  fmt_number(3)
| verlauf | n | prop |
|---|---|---|
| bad | 137 | 0.21 |
| good | 528 | 0.79 |